Imports required to unpack the datasets and to load the DDS (Doing Data Science) data sets from a set of CSV files.

In [ ]:
import os
import zipfile
from metrique.core_api import PandasClient

def xall(path):
    """Extract every member of a zip archive into the current working directory.

    Args:
        path: Location of the zip archive. A leading ``~`` is expanded to the
            user's home directory before the archive is opened.

    Returns:
        None. The extracted files are the only result.
    """
    # The context manager closes the archive handle even if extraction fails;
    # extractall() with no arguments writes into the current directory.
    with zipfile.ZipFile(os.path.expanduser(path)) as archive:
        archive.extractall()

In [ ]:
# Disabled by default: uncomment on a first run if ~/.metrique/repos does not
# exist yet (the %cd in the next cell expects the directory to be there).
#!mkdir ~/.metrique/repos

In [ ]:
# Switch the working directory so the clones and the relative paths used in
# the cells below all resolve inside ~/.metrique/repos.
%cd ~/.metrique/repos

Clone the metrique git repo; install metrique

In [ ]:
# NOTE(review): no repository URL is given here, so `git clone` cannot run as
# written — TODO restore the metrique repo URL. The install step mentioned in
# the heading is also not present in this cell.
!git clone

Clone the O'Reilly "Doing Data Science" sample dataset git repo, then unpack the datasets so they can be loaded.

In [ ]:
# NOTE(review): no repository URL is given here, so `git clone` cannot run as
# written — TODO restore the sample dataset repo URL.
!git clone

In [ ]:
# Unpack only when the NYT data is not already in the working directory.
# NOTE(review): both arguments end in '/', so they look like directories rather
# than archive files — confirm the intended archive names.
nyt_missing = not os.path.exists('nyt1.csv')
if nyt_missing:
    # First entry: the various doing data science datasets;
    # second entry: the nyt*.csv's.
    for archive_path in ('doing_data_science/', 'dds_datasets/'):
        xall(archive_path)

In [ ]:
# Client object whose .load() is used by the cells below to read the datasets.
z = PandasClient()

load up the datasets

In [25]:
# Globs are accepted. A single `?` matches exactly one character, so this
# pattern only samples the files with a one-character suffix (e.g. nyt1.csv
# ... nyt9.csv) rather than every nyt*.csv; even this subset takes 10s+.
%time nyt = z.load('./nyt?.csv')

CPU times: user 10.1 s, sys: 1.48 s, total: 11.6 s
Wall time: 11.7 s

In [26]:
# This dataset is a tab-delimited .txt file, hence the explicit sep='\t'.
%time ch5_binary = z.load('./dds_datasets/dds_ch5_binary-class-dataset.txt', sep='\t')

CPU times: user 103 ms, sys: 0 ns, total: 103 ms
Wall time: 104 ms

run pandas analysis

In [27]:

<matplotlib.axes.AxesSubplot at 0xa5818d0>

In [28]:

<matplotlib.axes.AxesSubplot at 0xa9a4ad0>

In [28]:

In [ ]: